# load packages into the working library library(devtools) library(cytoMEM) library(RAPID) library(tidyverse) library(ggpubr) library(flowCore) library(Biobase) library(FlowSOM) library(survival) library(survminer) library(plyr) library(Rtsne)
# number of subsamplings and t-SNE analyses to run NUMBER_OF_TESTS = 10 # number of cells per file to subsample NUMBER_OF_CELLS_TO_SAMPLE_PER_FILE = 600 # read data into R setwd(paste(getwd(),"/data_files/davis_dataset", sep = "")) data.set <- dir(pattern="*.fcs") data.lists <- lapply(lapply(data.set,read.FCS),exprs) my.data = as.data.frame(do.call(rbind, mapply(cbind, data.lists, "File ID" = c(1:length(data.lists)), SIMPLIFY = F))) colnames(my.data)[1:length(my.data)-1 ] <- as.character(read.FCS (data.set[[1]])@parameters@data[["desc"]]) # create varible for survival or clinical data OS.data.set <- dir(pattern="*.csv") OS.data = read.csv(OS.data.set)
# create output directory dir.create(file.path(getwd(), "output files"), showWarnings = FALSE) data_for_RMSD<-list() for (q in 1:NUMBER_OF_TESTS){ # equally sample the data files.to.sample = split(my.data,my.data$`File ID`) sampled.data <- list() for (i in 1: length(files.to.sample)){ if (nrow(files.to.sample[[i]])>NUMBER_OF_CELLS_TO_SAMPLE_PER_FILE){ sample.df = files.to.sample[[i]] sampled.data[[i]] = as.data.frame(sample.df[sample(nrow(sample.df), NUMBER_OF_CELLS_TO_SAMPLE_PER_FILE), ])} else{ sampled.data[[i]] = files.to.sample[[i]]}} my.sampled.data = as.data.frame(do.call(rbind, sampled.data)) to.tsne.data <- my.sampled.data %>% mutate_all(function(x) asinh(x/5)) %>% select(contains('(v)')) # run t-SNE analysis mytSNE = Rtsne( to.tsne.data, dims = 2, initial_dims = length(to.tsne.data), perplexity = 30, check_duplicates = FALSE, max_iter = 1000, verbose = TRUE) tsne.data = as.data.frame(mytSNE$Y) # find ideal number of clusters optimized.cluster.data = optimize_FlowSOM_clusters(tsne.data,to.tsne.data, N = 50, seed = 38) test.data = cbind(my.sampled.data,tsne.data,optimized.cluster.data) colnames(test.data)[(ncol(test.data)-2):ncol(test.data)]<- c("tSNE1","tSNE2","cluster") run.number = q ideal.cluster.number = max(test.data$`cluster`) write.csv(test.data,paste("./output files/Run ",run.number,"_",strftime(Sys.time(),"%Y-%m-%d_%H%M%S"),"all_data_with_tSNE_and_clusters.csv")) mat.input <- as.matrix(tsne.data) metadata <- data.frame(name = dimnames(mat.input)[[2]], desc = dimnames(mat.input)[[2]]) metadata$range <- apply(apply(mat.input, 2, range), 2, diff) metadata$minRange <- apply(mat.input, 2, min) metadata$maxRange <- apply(mat.input, 2, max) input.flowframe <- new("flowFrame", exprs=mat.input,parameters = AnnotatedDataFrame(metadata)) flowSOM.clusters <- list() for (k in 1:100) { fSOM <- FlowSOM(input.flowframe, scale = TRUE, colsToUse = c(1:length(tsne.data)), nClus= ideal.cluster.number) flowSOM.cluster <- as.matrix(fSOM[[2]][fSOM[[1]]$map$mapping[,1]]) flowSOM.clusters[[k]] <- as.numeric(as.vector(flowSOM.cluster))} true.clusters = optimized.cluster.data # find stable clusters f.table <- list() max.fmeasure <- list() for (i in 1:length(flowSOM.clusters)){ test.clusters = flowSOM.clusters[[i]] a <- table(true.clusters, test.clusters); p <- t(apply(a,1,function(x)x/colSums(a))) r <- apply(a,2,function(x)x/rowSums(a)) f <- 2*r*p / (r+p) f[is.na(f)] <- 0 f.table[[i]] <- f max.fmeasure[[i]] = apply(f,1,max) sum(apply(f,1,max) * (rowSums(a)/sum(a)))} all.clusters = plyr::ldply(flowSOM.clusters,rbind) saveRDS(all.clusters,paste("./output files/Run ",run.number,"_",strftime(Sys.time(),"%Y-%m-%d_%H%M%S"),"flowSOM_clusters.RDS")) all.max.fmeasures = plyr::ldply(max.fmeasure,rbind) avg.fmeasure = apply(all.max.fmeasures,2,mean) stable.clusters = as.numeric(as.vector(rownames(as.data.frame(avg.fmeasure[avg.fmeasure > 0.5])))) cells.in.stable.clusters = subset(test.data, test.data$`cluster` %in% stable.clusters) cluster_and_patient_df = as.data.frame(cbind(true.clusters,test.data$`File ID`)) cluster_and_patient_df[,1] <- as.factor(cluster_and_patient_df[,1]) cluster_and_patient_df[,2] <- as.factor(cluster_and_patient_df[,2]) patient.subsets <- split(cluster_and_patient_df,cluster_and_patient_df[,2]) subset.abundance <- list() for (j in 1:length(patient.subsets)){ subset.abundance[[j]] = (summary(patient.subsets[[j]][,1]))*100/nrow(patient.subsets[[j]])} all.subset.abundances = plyr::ldply(subset.abundance,rbind) all.subset.abundances[is.na(all.subset.abundances)]<-0 single.subset.abundance.data <- list() for(d in 1:ncol(all.subset.abundances)){ single.subset.abundance.data[[d]] = all.subset.abundances[,c(d)]} abundance.IQR <- list() for (b in 1:ncol(all.subset.abundances)){ abundance.IQR[[b]] = IQR(all.subset.abundances[,c(b)])} Abundance_IQR = as.vector(unlist(abundance.IQR)) rownames(all.subset.abundances)<- c(1:nrow(all.subset.abundances)) abundance.groups <- list() for(e in 1:ncol(all.subset.abundances)){ abundance.groups[[e]] <- (single.subset.abundance.data[[e]]>abundance.IQR[[e]]) abundance.groups[[e]][abundance.groups[[e]]==TRUE] = 1 names(abundance.groups)[[e]]<- colnames(all.subset.abundances)[e]} # 1 = high, 0 = low # survival analysis cox.summary<- list() count = 1 high.low.groups <- list() low.median = vector() high.median = vector() GNP <- vector() GPP <- vector() GNPstats <- list() GPPstats <- list() countgnp = 0 countgpp = 0 for (s in 1:length(stable.clusters)){ Group <- factor(abundance.groups[[paste0(stable.clusters[s],sep = "")]], levels = c(0,1), labels = c("Low", "High")) survival_data1 = cbind(OS.data,Group) high.low.groups[[s]] = split(survival_data1,survival_data1$Group) low.median[[s]] = median(high.low.groups[[s]][["Low"]][[2]]) high.median[[s]] = median(high.low.groups[[s]][["High"]][[2]]) model2 <- coxph(Surv(OS.data$Time, OS.data$Status) ~ Group, data=survival_data1) cox.summary[[stable.clusters[s]]] = summary(model2) survival_stat = cox.summary[[stable.clusters[s]]]$coefficients CI = cox.summary[[stable.clusters[s]]]$conf.int if (survival_stat[,5] <= 0.05 & survival_stat[,2] <= 1){ print(paste("Positive Prognostic Subset #", stable.clusters[s])) countgpp = countgpp + 1 GPP[countgpp] = stable.clusters[s] GPPstats[[countgpp]] = c(survival_stat[,5],survival_stat[,2],CI[,c(3:4)])} if (survival_stat[,5] <= 0.05 & survival_stat[,2] >= 1){ print(paste("Negative Prognostic Subset #",stable.clusters[s])) countgnp = countgnp + 1 GNP[countgnp] = stable.clusters[s] GNPstats[[countgnp]] = c(survival_stat[,5],survival_stat[,2],CI[,c(3:4)])}} significant.clusters = c(GNP,GPP) Median_High_Group = high.median Median_Low_Group = low.median all.abundance.data = rbind(all.subset.abundances,Abundance_IQR, Median_High_Group,Median_Low_Group,avg.fmeasure) colnames(all.abundance.data) <- paste(run.number,"_","Subset_", colnames(all.subset.abundances), sep = "") rownames(all.abundance.data)[(nrow(all.abundance.data)-3):(nrow(all.abundance.data))]<-c("Abundance_IQR", "Median_High_Group","Median_Low_Group","F-measure") write.csv(all.abundance.data, paste("./output files/Run ",run.number,"_",strftime(Sys.time(),"%Y-%m-%d_%H%M%S"),"_subset_abundances_withIQRandfmeasure.csv",sep="")) # run MEM on significant pops MEMdata = to.tsne.data cluster = true.clusters MEM.data = cbind(MEMdata,cluster) MEM.values.p = MEM(MEM.data, transform = FALSE, cofactor = 0, choose.markers = FALSE, markers = "1,3:4,6:10,12:13,15:16,18:23,25:27,30:31",choose.ref = FALSE, zero.ref = FALSE, rename.markers = FALSE, new.marker.names = "CD45,CD19,CD22,tIKAROS,CD79b,CD20,CD34,CD179a,CD123,IGMi,CD10,CD179b,CD24,TSLPR,CD127,RAG1,TDT,PAX5,CD43,CD38,CD58,HLA-DR,IGMs",file.is.clust = FALSE, add.fileID = FALSE, IQR.thresh = NULL) MEM.values.s = MEM(MEM.data, transform = FALSE, cofactor = 0, choose.markers = FALSE, markers = "2,5,11,14,17,24,28:29,32",choose.ref = FALSE, zero.ref = FALSE, rename.markers = FALSE, new.marker.names = "p-PLCg,p-4EBP1,p-STAT5,p-IKAROS,p-AKT,p-Syk,p-S6,p-ERK,p-CREB",file.is.clust = FALSE, add.fileID = FALSE, IQR.thresh = NULL) MEM_Matrix = cbind(MEM.values.p$MEM_matrix[[1]],MEM.values.s$MEM_matrix[[1]]) MEM_for_RMSD = subset(MEM_Matrix, rownames(MEM_Matrix) %in% significant.clusters) rownames(MEM_for_RMSD)<-paste0(run.number,"_",rownames(MEM_for_RMSD),sep = "") ordered.MEM = as.data.frame(MEM_for_RMSD[order(row.names(MEM_for_RMSD)),,drop = FALSE ]) gnpstats = do.call(rbind,GNPstats) if (length(gnpstats)!=0) { colnames(gnpstats)[1:2] <- c("p", "HR") rownames(gnpstats) <- c(GNP) } else{ gnpstats = c(1, 0) } gppstats = do.call(rbind, GPPstats) if (length(gppstats) != 0) { colnames(gppstats)[1:2] <- c("p", "HR") rownames(gppstats) <- c(GPP) } else { gppstats = c(1, 0) } combined.stats = rbind(gnpstats,gppstats) if((gnpstats == c(1,0))|(gppstats == c(1,0))){ combined.stats = combined.stats[-c(which(combined.stats[,1]==1)),,drop = FALSE]} rownames(combined.stats)<-paste0(run.number,"_",rownames(combined.stats),sep = "") ordered.data = as.data.frame(combined.stats[order(row.names(combined.stats)),,drop = FALSE ]) all.significant.cluster.data = cbind(ordered.data,ordered.MEM) write.csv(all.significant.cluster.data, paste("./output files/Run ",run.number,"_",strftime(Sys.time(),"%Y-%m-%d_%H%M%S"),"_siginificant_stable_cluster_data.csv",sep="")) data_for_RMSD[[q]]<- as.data.frame(all.significant.cluster.data) } dataRMSD = do.call(rbind,data_for_RMSD) to.RMSD = as.matrix(dataRMSD[-c(1:4)]) write.csv(to.RMSD,paste("./output files/Run ",run.number,"_",strftime(Sys.time(),"%Y-%m-%d_%H%M%S"),"_RMSD_input_data.csv",sep="")) # run RMSD and make heatmap for significant pops from all runs pdf(paste("./output files/Run ",run.number,"_",strftime(Sys.time(),"%Y-%m-%d_%H%M%S"),"_RMSD_heatmap.pdf",sep=""),width = 20, height =20) rmsd <- MEM_RMSD(to.RMSD,format = NULL, output.matrix = TRUE,newWindow.heatmaps = FALSE) dev.off() # print session information sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.